*--------------------------------------------------------------------MERGE_AA_TAX_ROLL_MLS_DATA.DO---------------------------------------------------------------------------------
*This script merges formatted MLS sales data and tax roll data using street address concordances created using a combination of Excel Fuzzy Matching and Arc-GIS Geocoding.
*Sebastien Bradley
*11/26/13

version 12
clear all
capture cd "C:/Users/sjb355/Documents/Research/PropertyTaxes/Ann_Arbor/MLS"
capture log close
log using merge_AA_tax_roll_MLS_data_results.txt, text replace
set more off


*********************************************************************MERGE ON FUZZY MATCHING RESULTS**********************************************************************
*IMPORT CONCORDANCE DATA AND REFORMAT
insheet using ./AA_tax_roll_MLS_matches_Fuzzy.txt, tab names case
*replace I_match = "1" if I_match=="ambiguous" & regexm(I_match_note, "Does not exist")		/*I_match = 1 for subset of addresses with fuzzy match similarity score >= 0.9 and manually verified*/ 
*drop if I_match=="ambiguous" | I_match=="0"
drop if I_match==0
drop I_match

replace mls_address = subinword(mls_address, "STREET", "ST",1)		/*Not sure how this got through*/
rename mls_id ml
sort ml


*MERGE TO MLS DATA
merge 1:1 ml using AA_sales_MLS_data.dta, keep(match) nogen
foreach v in pre sev tv {
	rename `v' mls_`v'
}


*MERGE W/ TAX ROLL DATA AND REFORMAT
gen year = year(listdate)
sort pid year listdate
drop if (pid==pid[_n-1] & mls_address~=mls_address[_n-1] & mls_address~=pid_address) | (pid==pid[_n+1] & mls_address~=mls_address[_n+1] & mls_address~=pid_address)
drop if pid==pid[_n-1] & mls_address==mls_address[_n-1] & year==year[_n-1]

merge 1:1 pid year using ../AA_tax_roll.dta
do ../flag_repeat_buyers											/*Flag owners who either moved and/or owned multiple properties simultaneously in AA*/

*Define leads and lags of key tax parameters:
sort pid year
foreach v in pre sev tv saledate lastsaleamt {
	gen F_`v' = `v'[_n+1] if pid==pid[_n+1] & year==year[_n+1]-1
	gen L_`v' = `v'[_n-1] if pid==pid[_n-1] & year==year[_n-1]+1
}

drop if _merge~=3
drop _merge

*Flag possible mismatches based on TV at time of listing:
gen tmp_tv_ratio = tv/(mls_tv*1000)
summarize tmp_tv_ratio, detail

replace mls_tv = 86.006 if ml==3001823
replace mls_tv = 96.947 if ml==2608523

gen tmp_I_lag = (round(L_tv/(mls_tv*1000),0.01)==1.00)			/*Re-align TV, SEV, and PRE data to tax roll for early list dates within year (s.t. MLS figures truly reflect current values at time of sale).*/
gen tmp_I_lead = (F_tv==mls_tv*1000 & tv~=mls_tv*1000)			/*Enforce that TV, SEV, and PRE correspond to values at time of listing (not updated for time of sale)*/
foreach v in pre sev tv {
	replace mls_`v' = `v'/1000 if tmp_I_lag==1 | tmp_I_lead==1
}
replace tmp_tv_ratio = tv/(mls_tv*1000)
summarize tmp_tv_ratio, detail
sort tmp_tv_ratio pid year
order mls_address pid_address year tmp_tv_ratio tmp_I_lag tmp_I_lead mls_tv L_tv tv F_tv mls_sev L_sev sev F_sev pre *_pre
drop if tmp_tv_ratio~=1 & word(mls_address,1)~=word(pid_address,1)
drop tmp* 

replace mills = mills_pre*mls_pre/100 + mills_nonpre*(100-mls_pre)/100

save AA_tax_roll_MLS_Excel_matches.dta, replace


*********************************************************************MERGE ON GEOCODING RESULTS**********************************************************************
*IMPORT CONCORDANCE DATA
insheet using GIS_match_standard_addresses.txt, clear tab names case
replace mls_address = subinword(mls_address, "STREET", "ST",1)		/*Not sure how this got through*/
rename mls_id ml
sort ml

*MERGE TO MLS DATA
merge 1:1 ml using AA_sales_MLS_data.dta, keep(match) nogen
foreach v in pre sev tv {
	rename `v' mls_`v'
}

*Exclude condos with numbered units
drop if I_condo==1 & (regexm(trim(mls_address),"[0-9]$") | regexm(trim(pid_address),"[0-9]$"))
drop if I_condo==1 & (regexm(word(mls_address,-1),"^[0-9]") | regexm(word(pid_address,-1),"^[0-9]"))  
drop if I_condo==1 & (regexm(mls_address,"#") | regexm(pid_address,"#")) 
drop if I_condo==1 & (regexm(upper(mls_address)," UNIT ") | regexm(upper(pid_address)," UNIT ")) 

*MERGE W/ TAX ROLL DATA AND REFORMAT
gen year = year(listdate)
sort pid year listdate
drop if (pid==pid[_n-1] & mls_address~=mls_address[_n-1] & mls_address~=pid_address) | (pid==pid[_n+1] & mls_address~=mls_address[_n+1] & mls_address~=pid_address)
drop if pid==pid[_n-1] & mls_address==mls_address[_n-1] & year==year[_n-1]

merge 1:1 pid year using ../AA_tax_roll.dta
do ../flag_repeat_buyers											/*Flag owners who either moved and/or owned multiple properties simultaneously in AA*/

*Define leads and lags of key tax parameters:
sort pid year
foreach v in pre sev tv saledate lastsaleamt {
	gen F_`v' = `v'[_n+1] if pid==pid[_n+1] & year==year[_n+1]-1
	gen L_`v' = `v'[_n-1] if pid==pid[_n-1] & year==year[_n-1]+1
}

drop if _merge~=3
drop _merge

*Flag possible mismatches based on TV at time of listing:
gen tmp_tv_ratio = tv/(mls_tv*1000)
summarize tmp_tv_ratio, detail

replace mls_tv = 86.006 if ml==3001823
replace mls_tv = 96.947 if ml==2608523

gen tmp_I_lag = (round(L_tv/(mls_tv*1000),0.01)==1.00)			/*Re-align TV, SEV, and PRE data to tax roll for early list dates within year (s.t. MLS figures truly reflect current values at time of sale).*/
gen tmp_I_lead = (F_tv==mls_tv*1000 & tv~=mls_tv*1000)			/*Enforce that TV, SEV, and PRE correspond to values at time of listing (not updated for time of sale)*/
foreach v in pre sev tv {
	replace mls_`v' = `v'/1000 if tmp_I_lag==1 | tmp_I_lead==1
}
replace tmp_tv_ratio = tv/(mls_tv*1000)
summarize tmp_tv_ratio, detail
sort tmp_tv_ratio pid year
order mls_address pid_address year tmp_tv_ratio tmp_I_lag tmp_I_lead mls_tv L_tv tv F_tv mls_sev L_sev sev F_sev pre *_pre, first
drop if tmp_tv_ratio~=1 & word(mls_address,1)~=word(pid_address,1)
drop tmp* 

replace mills = mills_pre*mls_pre/100 + mills_nonpre*(100-mls_pre)/100

save AA_tax_roll_MLS_GIS_matches.dta, replace


*************************************************************************COMBINE MATCHING RESULTS AND TAKE UNION*********************************************
use AA_tax_roll_MLS_GIS_matches.dta, clear
append using AA_tax_roll_MLS_Excel_matches.dta
sort ml year pid
order renovage *_score match_address x y, last

duplicates drop mls_address-renovage, force

*In case of remaining duplicates, preserve geocoded results (with x,y coordinates).
drop if ((ml==ml[_n-1] & pid==pid[_n-1]) | (ml==ml[_n+1] & pid==pid[_n+1])) & similarity_score~=.
drop if ((ml==ml[_n-1] & pid~=pid[_n-1]) | (ml==ml[_n+1] & pid~=pid[_n+1])) & x==.


save AA_tax_roll_MLS_matches.dta, replace

log close


/**/
